Loading libraries

library(scran)
library(scater)
library(HDF5Array)
library(ggplot2)
library(Matrix)
library(pheatmap)
library(RColorBrewer)
library(uwot)
library(grid)
library(gridExtra)
library(GEDI)

Directories

dir_data<- "../data_objects/"
dir_data_hdf5<- paste0(dir_data, "COVID19_SCE/")

Functions

#' Plot 2D embedding 
#'
#' Plot a 2D representation (embedding) of cells
#' @param embedding_mat Embedding
#' @param colour vector of variable to plot
#' @param randomize Logical. Whether to randomize data before plotting.
#'
#' @return ggplot2 object
#' @export
#'
plot_embedding <- function(embedding_mat,colour,randomize=T, size_point=0.05) {
  # create a data frame that will have the embedding as well as the colors
  embedding_obj <- data.frame(
    Dim1=embedding_mat[,1],
    Dim2=embedding_mat[,2],
    Var=colour )

  # randomize the order of the objects
  if( randomize ) {
    embedding_obj <- embedding_obj[ sample.int(nrow(embedding_obj)), ]
  }
  # create the plots
  if(is.numeric(colour)) # the color variable is numeric
  {
    #embedding_obj$Var <- embedding_obj$Var - mean(embedding_obj$Var)
    lim <- stats::quantile(abs(embedding_obj$Var),0.99)
    ggplot2::ggplot(
      embedding_obj, ggplot2::aes_string( x="Dim1", y="Dim2", colour="Var"))+
      ggplot2::geom_point(size=size_point)+
      ggplot2::theme_minimal()+
      ggplot2::scale_color_gradientn( limits=c(-lim,lim), colours=c("blue","light grey","red"), oob=scales::squish )
  } else {
    ggplot2::ggplot(
      embedding_obj, ggplot2::aes_string( x="Dim1", y="Dim2", colour="Var"))+
      ggplot2::geom_point(size=size_point)+
      ggplot2::theme_minimal()+
      ggplot2::guides(colour=ggplot2::guide_legend(override.aes=list(size=3)))

  }
}

summary_markers<- function(markers_sce){
    lis_markers<- lapply(names(markers_sce), function(cluster_look) {
        temp_df<- markers_sce[[cluster_look]]
        temp_df$Gene<- rownames(temp_df)
        temp_df$cluster<- cluster_look
        # Checking if an FDR is equal to 0
        if( any(temp_df$FDR == 0) ){
            min_pvalue<- sort(unique(temp_df$FDR))[2] # take the second lowest pvalue
            cat("pvalues equal to 0 have been changed to:", min_pvalue, "\n")
            temp_df$FDR[temp_df$FDR == 0] <- min_pvalue # setting lower bound
        }
        temp_df$neglog_FDR<- -log10(temp_df$FDR)
        temp_df<- temp_df[sort(rownames(temp_df)),]
        data.frame(temp_df[,c("Gene", "cluster", "p.value", "FDR", "neglog_FDR")])
    })
    names(lis_markers)<- names(markers_sce)    
    return(lis_markers)    
}

pheatmap.colorsymmetric <- function(x,lim=NULL,...)
{
    require(pheatmap)
    if(is.null(lim) ){
        lim <- max(abs(x), na.rm=TRUE)
    }
    if( min(x, na.rm=TRUE) < 0 ){
        lim_down<- -lim
        col_palette<- colorRampPalette(c("blue","white","red"))(256)
    }else{        
        lim_down<- 0
        col_palette<- colorRampPalette(c("white","red"))(256)
    }
    pheatmap(
        x, color = col_palette,
        breaks=seq(lim_down,lim,length.out=255), ... )
}

Loading data

# Reading SCE object
sce<- loadHDF5SummarizedExperiment(dir=dir_data_hdf5)

Loading GEDI model

meta<- data.frame(colData(sce))

# Reading GEDI model 
model<- readRDS(paste0(dir_data, "COVID19_gedi_model_cohort1_TF.rds"))
    
# reorder meta based on GEDI order
meta<- meta[model$aux$cellIDs,]

# Get activities per cell
ADB<- getADB.gedi(model)

# Get ZDB
ZDB<- getZDB.gedi(model)

# Get the gradient for all TFs
gradients <- getActivityGradients.gedi( model )

# Now, retrieve the differential gene expression per cell ( severe vs control)
t( model$aux$inputH)
            (Intercept) group_per_samplemild group_per_samplesevere
C19-CB-0001           1                    1                      0
C19-CB-0003           1                    1                      0
C19-CB-0002           1                    1                      0
C19-CB-0005           1                    1                      0
C19-CB-0009           1                    0                      1
C19-CB-0012           1                    0                      1
C19-CB-0013           1                    0                      1
C19-CB-0011           1                    0                      1
C19-CB-0008           1                    0                      1
C19-CB-0020           1                    0                      1
C19-CB-0021           1                    0                      1
C19-CB-0016           1                    0                      1
C19-CB-0198           1                    0                      1
C19-CB-0204           1                    1                      0
C19-CB-0199           1                    0                      1
C19-CB-0214           1                    1                      0
C19-CB-0053           1                    1                      0
C19-CB-0052           1                    1                      0
P18F                  1                    0                      0
P17H                  1                    0                      0
P20H                  1                    0                      0
P15F                  1                    0                      0
P08H                  1                    0                      0
P13H                  1                    0                      0
P07H                  1                    0                      0
P06F                  1                    0                      0
P04H                  1                    0                      0
C2P01H                1                    0                      0
P09H                  1                    0                      0
P02H                  1                    0                      0
C2P05F                1                    0                      0
C2P07H                1                    0                      0
C2P13F                1                    0                      0
C2P16H                1                    0                      0
C2P10H                1                    0                      0
C2P19H                1                    0                      0
C2P15H                1                    0                      0
one_k_v3              1                    0                      0
Five_k_v3             1                    0                      0
Ten_k_v3              1                    0                      0
DiffExp <- getDiffExp.gedi( model, c(0,0,1) )
meta$velocity_severe<- colSums(DiffExp^2)

Estimate Dot Product and Cosine Similarity

dotprod <- crossprod(DiffExp,gradients) # to get cosine similarity, first calculate dot product

cosineSim <- dotprod / sqrt(colSums(DiffExp^2)) # then, divide by the length of the expression vectors
cosineSim <- t( t(cosineSim) / sqrt(colSums(gradients^2)) ) # and also divide by the length of the

# The gradient vector of each TF is first normalized to have a length of one (by dividing by the Euclidean length of the vector)
gradients_norm<- scale(gradients, center=FALSE, scale=apply(gradients, 2, norm, type="2") )

dotprod_norm <- crossprod(DiffExp,gradients_norm) # to get cosine similarity, first calculate dot product

TF gradient

# Choosing TF
tf<- "SPI1"
set.seed(43)

## TF gradient

C<- model$aux$inputC # Get input C matrix

Cindex <- which(colnames(C)==tf)
vectorField <- svd.joint_vectorField_gradient.gedi(
  model, start.cond = c(1,0,0), end.cond = c(1,0,1), Cindex, scale_cond_vector = 0.5  )
Gradient vectors will be scaled by a factor of 0.000394667923765283.
# Euclidean distance
umap_vectorField <- umap(
  vectorField$v %*% diag(vectorField$d), min_dist=0.5,
  metric="euclidean")
## Cell type embedding indices
ggp<- plot_embedding( umap_vectorField[vectorField$embedding_indices,], meta$id.celltype) +
    theme_void() +
    theme(legend.position ="none")
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.

Warning: Please use tidy evaluation idioms with `aes()`.

Warning: See also `vignette("ggplot2-in-packages")` for more information.
ggp

## Saving the colors
g <- ggplot_build(ggp)

df<- g$data[[1]]
df<- unique(df[,c("group", "colour")])
df<- df[order(df$group),]
temp_vec<- levels(meta$id.celltype)
temp_vec<- temp_vec[temp_vec %in% unique(meta$id.celltype)]
df$celltype<- temp_vec

vec_colors<- df$colour
names(vec_colors)<- df$celltype


ggp<- plot_embedding( umap_vectorField[vectorField$embedding_indices,], meta$id.celltype) +
    theme_void() +
    theme(legend.position ="right")

legend <- cowplot::get_legend(ggp)

grid.newpage()
grid.draw(legend)

## covid vector field with the speed 
ggp<- plot_vectorField( umap_vectorField[vectorField$vectorField_indices,], meta$velocity, minNum=15 ) +
    theme_void() +
    labs(title="Vector field of severe COVID-19") +
    theme(legend.position ="right")

ggp

## TF gradient with TF activity
ggp<- plot_vectorField( umap_vectorField[vectorField$gradient_indices,], ADB[tf,], minNum=15 ) +
    theme_void() +
    labs(title=paste0("TF activity:", tf)) +
    theme(legend.position ="right")

ggp

## UMAP plot with TF activity
ggp<- plot_embedding( umap_vectorField[vectorField$embedding_indices,], ADB[tf,]) +
    theme_void() +
    labs(title=paste0("TF activity:", tf)) +    
    theme(legend.position="right")

ggp

sessionInfo()
R version 4.0.0 (2020-04-24)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2020/Core/imkl/2020.1.217/compilers_and_libraries_2020.1.217/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so

locale:
 [1] LC_CTYPE=en_CA.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_CA.UTF-8        LC_COLLATE=en_CA.UTF-8    
 [5] LC_MONETARY=en_CA.UTF-8    LC_MESSAGES=en_CA.UTF-8   
 [7] LC_PAPER=en_CA.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
 [1] grid      parallel  stats4    stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] GEDI_0.0.0.9000             gridExtra_2.3              
 [3] uwot_0.1.10                 RColorBrewer_1.1-2         
 [5] pheatmap_1.0.12             HDF5Array_1.18.1           
 [7] rhdf5_2.34.0                DelayedArray_0.16.3        
 [9] Matrix_1.3-3                scater_1.18.6              
[11] ggplot2_3.4.2               scran_1.18.6               
[13] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
[15] Biobase_2.50.0              GenomicRanges_1.42.0       
[17] GenomeInfoDb_1.26.7         IRanges_2.24.1             
[19] S4Vectors_0.28.1            BiocGenerics_0.36.0        
[21] MatrixGenerics_1.2.1        matrixStats_0.58.0         

loaded via a namespace (and not attached):
 [1] bitops_1.0-6              RcppAnnoy_0.0.18         
 [3] backports_1.2.1           tools_4.0.0              
 [5] bslib_0.2.4               utf8_1.2.1               
 [7] R6_2.5.0                  irlba_2.3.3              
 [9] vipor_0.4.5               colorspace_2.0-0         
[11] rhdf5filters_1.2.0        withr_2.5.0              
[13] tidyselect_1.2.0          compiler_4.0.0           
[15] cli_3.6.1                 BiocNeighbors_1.8.2      
[17] labeling_0.4.2            sass_0.3.1               
[19] checkmate_2.1.0           scales_1.2.1             
[21] metR_0.13.0               stringr_1.5.0            
[23] digest_0.6.27             rmarkdown_2.7            
[25] XVector_0.30.0            pkgconfig_2.0.3          
[27] htmltools_0.5.1.1         sparseMatrixStats_1.2.1  
[29] highr_0.8                 fastmap_1.1.0            
[31] limma_3.46.0              rlang_1.1.0              
[33] DelayedMatrixStats_1.12.3 farver_2.1.0             
[35] jquerylib_0.1.3           generics_0.1.3           
[37] jsonlite_1.8.4            BiocParallel_1.24.1      
[39] dplyr_1.1.1               RCurl_1.98-1.3           
[41] magrittr_2.0.3            BiocSingular_1.6.0       
[43] GenomeInfoDbData_1.2.4    scuttle_1.0.4            
[45] Rcpp_1.0.8.3              ggbeeswarm_0.6.0         
[47] munsell_0.5.0             Rhdf5lib_1.12.1          
[49] fansi_0.4.2               viridis_0.5.1            
[51] lifecycle_1.0.3           stringi_1.5.3            
[53] yaml_2.2.1                edgeR_3.32.1             
[55] zlibbioc_1.36.0           plyr_1.8.6               
[57] dqrng_0.2.1               lattice_0.20-41          
[59] cowplot_1.1.1             beachmat_2.6.4           
[61] locfit_1.5-9.4            knitr_1.31               
[63] pillar_1.8.1              igraph_1.3.4             
[65] codetools_0.2-16          glue_1.6.2               
[67] evaluate_0.14             data.table_1.14.0        
[69] vctrs_0.6.1               gtable_0.3.0             
[71] cachem_1.0.4              xfun_0.22                
[73] rsvd_1.0.5                RcppEigen_0.3.3.9.1      
[75] RSpectra_0.16-0           viridisLite_0.3.0        
[77] tibble_3.2.1              memoise_2.0.0            
[79] beeswarm_0.3.1            bluster_1.0.0            
[81] statmod_1.4.35           
---
title: "COVID-19 cohort1 TF gradient"
output:
  html_notebook:
    df_print: paged
---

# Loading libraries

```{r, message=FALSE}

library(scran)
library(scater)
library(HDF5Array)
library(ggplot2)
library(Matrix)
library(pheatmap)
library(RColorBrewer)
library(uwot)
library(grid)
library(gridExtra)
library(GEDI)

```

# Directories

```{r}

dir_data<- "../data_objects/"
dir_data_hdf5<- paste0(dir_data, "COVID19_SCE/")

```

# Functions

```{r}

#' Plot 2D embedding 
#'
#' Plot a 2D representation (embedding) of cells
#' @param embedding_mat Embedding
#' @param colour vector of variable to plot
#' @param randomize Logical. Whether to randomize data before plotting.
#'
#' @return ggplot2 object
#' @export
#'
plot_embedding <- function(embedding_mat,colour,randomize=T, size_point=0.05) {
  # create a data frame that will have the embedding as well as the colors
  embedding_obj <- data.frame(
    Dim1=embedding_mat[,1],
    Dim2=embedding_mat[,2],
    Var=colour )

  # randomize the order of the objects
  if( randomize ) {
    embedding_obj <- embedding_obj[ sample.int(nrow(embedding_obj)), ]
  }
  # create the plots
  if(is.numeric(colour)) # the color variable is numeric
  {
    #embedding_obj$Var <- embedding_obj$Var - mean(embedding_obj$Var)
    lim <- stats::quantile(abs(embedding_obj$Var),0.99)
    ggplot2::ggplot(
      embedding_obj, ggplot2::aes_string( x="Dim1", y="Dim2", colour="Var"))+
      ggplot2::geom_point(size=size_point)+
      ggplot2::theme_minimal()+
      ggplot2::scale_color_gradientn( limits=c(-lim,lim), colours=c("blue","light grey","red"), oob=scales::squish )
  } else {
    ggplot2::ggplot(
      embedding_obj, ggplot2::aes_string( x="Dim1", y="Dim2", colour="Var"))+
      ggplot2::geom_point(size=size_point)+
      ggplot2::theme_minimal()+
      ggplot2::guides(colour=ggplot2::guide_legend(override.aes=list(size=3)))

  }
}

summary_markers<- function(markers_sce){
    lis_markers<- lapply(names(markers_sce), function(cluster_look) {
        temp_df<- markers_sce[[cluster_look]]
        temp_df$Gene<- rownames(temp_df)
        temp_df$cluster<- cluster_look
        # Checking if an FDR is equal to 0
        if( any(temp_df$FDR == 0) ){
            min_pvalue<- sort(unique(temp_df$FDR))[2] # take the second lowest pvalue
            cat("pvalues equal to 0 have been changed to:", min_pvalue, "\n")
            temp_df$FDR[temp_df$FDR == 0] <- min_pvalue # setting lower bound
        }
        temp_df$neglog_FDR<- -log10(temp_df$FDR)
        temp_df<- temp_df[sort(rownames(temp_df)),]
        data.frame(temp_df[,c("Gene", "cluster", "p.value", "FDR", "neglog_FDR")])
    })
    names(lis_markers)<- names(markers_sce)    
    return(lis_markers)    
}

pheatmap.colorsymmetric <- function(x,lim=NULL,...)
{
    require(pheatmap)
    if(is.null(lim) ){
        lim <- max(abs(x), na.rm=TRUE)
    }
    if( min(x, na.rm=TRUE) < 0 ){
        lim_down<- -lim
        col_palette<- colorRampPalette(c("blue","white","red"))(256)
    }else{        
        lim_down<- 0
        col_palette<- colorRampPalette(c("white","red"))(256)
    }
    pheatmap(
        x, color = col_palette,
        breaks=seq(lim_down,lim,length.out=255), ... )
}

```

# Loading data

```{r}

# Reading SCE object
sce<- loadHDF5SummarizedExperiment(dir=dir_data_hdf5)

```

# Loading GEDI model

```{r}

meta<- data.frame(colData(sce))

# Reading GEDI model 
model<- readRDS(paste0(dir_data, "COVID19_gedi_model_cohort1_TF.rds"))
    
# reorder meta based on GEDI order
meta<- meta[model$aux$cellIDs,]

# Get activities per cell
ADB<- getADB.gedi(model)

# Get ZDB
ZDB<- getZDB.gedi(model)

# Get the gradient for all TFs
gradients <- getActivityGradients.gedi( model )

# Now, retrieve the differential gene expression per cell ( severe vs control)
t( model$aux$inputH)

DiffExp <- getDiffExp.gedi( model, c(0,0,1) )
meta$velocity_severe<- colSums(DiffExp^2)


```

Estimate Dot Product and Cosine Similarity


```{r}

dotprod <- crossprod(DiffExp,gradients) # to get cosine similarity, first calculate dot product

cosineSim <- dotprod / sqrt(colSums(DiffExp^2)) # then, divide by the length of the expression vectors
cosineSim <- t( t(cosineSim) / sqrt(colSums(gradients^2)) ) # and also divide by the length of the

# The gradient vector of each TF is first normalized to have a length of one (by dividing by the Euclidean length of the vector)
gradients_norm<- scale(gradients, center=FALSE, scale=apply(gradients, 2, norm, type="2") )

dotprod_norm <- crossprod(DiffExp,gradients_norm) # to get cosine similarity, first calculate dot product


```


## TF gradient

```{r}

# Choosing TF
tf<- "SPI1"

```

```{r}

set.seed(43)

## TF gradient

C<- model$aux$inputC # Get input C matrix

Cindex <- which(colnames(C)==tf)
vectorField <- svd.joint_vectorField_gradient.gedi(
  model, start.cond = c(1,0,0), end.cond = c(1,0,1), Cindex, scale_cond_vector = 0.5  )

# Euclidean distance
umap_vectorField <- umap(
  vectorField$v %*% diag(vectorField$d), min_dist=0.5,
  metric="euclidean")

```

```{r, fig.width=7, fig.height=7}

## Cell type embedding indices
ggp<- plot_embedding( umap_vectorField[vectorField$embedding_indices,], meta$id.celltype) +
    theme_void() +
    theme(legend.position ="none")

ggp

## Saving the colors
g <- ggplot_build(ggp)

df<- g$data[[1]]
df<- unique(df[,c("group", "colour")])
df<- df[order(df$group),]
temp_vec<- levels(meta$id.celltype)
temp_vec<- temp_vec[temp_vec %in% unique(meta$id.celltype)]
df$celltype<- temp_vec

vec_colors<- df$colour
names(vec_colors)<- df$celltype


ggp<- plot_embedding( umap_vectorField[vectorField$embedding_indices,], meta$id.celltype) +
    theme_void() +
    theme(legend.position ="right")

legend <- cowplot::get_legend(ggp)

grid.newpage()
grid.draw(legend)


## covid vector field with the speed 
ggp<- plot_vectorField( umap_vectorField[vectorField$vectorField_indices,], meta$velocity, minNum=15 ) +
    theme_void() +
    labs(title="Vector field of severe COVID-19") +
    theme(legend.position ="right")

ggp

## TF gradient with TF activity
ggp<- plot_vectorField( umap_vectorField[vectorField$gradient_indices,], ADB[tf,], minNum=15 ) +
    theme_void() +
    labs(title=paste0("TF activity:", tf)) +
    theme(legend.position ="right")

ggp


## UMAP plot with TF activity
ggp<- plot_embedding( umap_vectorField[vectorField$embedding_indices,], ADB[tf,]) +
    theme_void() +
    labs(title=paste0("TF activity:", tf)) +    
    theme(legend.position="right")

ggp

```



```{r}

sessionInfo()

```
